package au.com.acpfg.proteomics; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.io.StringReader; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.knime.base.data.append.column.AppendedColumnRow; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataRow; import org.knime.core.data.DataTableSpec; import org.knime.core.data.RowIterator; import org.knime.core.data.def.BooleanCell; import org.knime.core.data.def.StringCell; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.ExecutionContext; import org.knime.core.node.ExecutionMonitor; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.NodeModel; import org.knime.core.node.NodeSettingsRO; import org.knime.core.node.NodeSettingsWO; import org.knime.core.node.defaultnodesettings.SettingsModelBoolean; import org.knime.core.node.defaultnodesettings.SettingsModelString; /** * This is the model implementation of MinProteinList. * Uses a greedy set cover algorithm to identify the minimal set of proteins which can explain the observed peptides * * @author Andrew Cassin */ public class MinProteinListNodeModel extends NodeModel { static final String CFGKEY_PEPTIDES = "peptides"; static final String CFGKEY_PROTEIN = "protein"; static final String CFGKEY_ALGO = "algorithm"; static final String CFGKEY_SOLVER = "solver-program"; private final SettingsModelString m_peptide_column = new SettingsModelString(CFGKEY_PEPTIDES, "Peptides"); private final SettingsModelString m_accsn_column = new SettingsModelString(CFGKEY_PROTEIN, "Protein"); private final SettingsModelString m_algorithm = new SettingsModelString(CFGKEY_ALGO, "ILP: Minimum Set Cover"); private final SettingsModelString m_solver = new SettingsModelString(CFGKEY_SOLVER, "c:/cygwin/bin/glpsol.exe"); /** * Constructor for the node model. */ protected MinProteinListNodeModel() { super(1, 1); } /** * {@inheritDoc} */ @Override protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { int pep_idx = inData[0].getDataTableSpec().findColumnIndex(m_peptide_column.getStringValue()); int accsn_idx= inData[0].getDataTableSpec().findColumnIndex(m_accsn_column.getStringValue()); if (pep_idx < 0 || accsn_idx < 0 || pep_idx == accsn_idx) { throw new Exception("Illegal columns: "+m_peptide_column+" "+m_accsn_column+", re-configure the node!"); } DataTableSpec newSpec = new DataTableSpec(inData[0].getDataTableSpec(), make_output_spec()); BufferedDataContainer container = exec.createDataContainer(newSpec); RowIterator it = inData[0].iterator(); HashMap<String,String> prot2pep = new HashMap<String,String>(); HashMap<String,String> pep2lp = new HashMap<String,String>(); HashMap<String,String> prot2lp = new HashMap<String,String>(); HashMap<String,Set<String>> pep2protkeys = new HashMap<String,Set<String>>(); int peptide_idx = 1; int prot_idx = 1; while (it.hasNext()) { DataRow r = it.next(); DataCell pep_cell = r.getCell(pep_idx); DataCell accsn_cell= r.getCell(accsn_idx); // rows with missing cells cannot be processed (no missing values in ILP...) if (pep_cell.isMissing() || accsn_cell.isMissing()) { continue; } String peptides_as_csv = ((StringCell)pep_cell).getStringValue(); String protein_accsn = ((StringCell)accsn_cell).getStringValue(); if (peptides_as_csv.trim().length() < 1 || protein_accsn.trim().length() < 1) { throw new Exception("Must supply valid Protein ID (accession) and peptide list - no blank cells!"); } String[] peptides = peptides_as_csv.split(",\\s+"); prot2pep.put(protein_accsn, peptides_as_csv); for (String pep : peptides) { if (!pep2lp.containsKey(pep)) { String key = "_p"+peptide_idx+"_"; peptide_idx++; pep2lp.put(pep, key ); } } if (prot2lp.containsKey(protein_accsn)) { throw new Exception("Error at row "+r.getKey().getString()+": already seen peptides for protein ID "+protein_accsn); } String key = "_x"+prot_idx+"_"; prot_idx++; prot2lp.put(protein_accsn, key); for (String pep : peptides) { if (pep2protkeys.containsKey(pep)) { Set<String> s = pep2protkeys.get(pep); s.add(key); pep2protkeys.put(pep, s); } else { Set<String> s = new HashSet<String>(); s.add(key); pep2protkeys.put(pep, s); } } } // non-equal costs? HashMap<String,Double> costs = new HashMap<String,Double>(); // protein key -> weight (lower is better) if (m_algorithm.getStringValue().toLowerCase().contains("unique")) { for (String accsn : prot2lp.keySet()) { String pep_csv = prot2pep.get(accsn); String[] peptides = pep_csv.split(",\\s+"); int unique_cnt = 0; for (String pep : peptides) { Set<String> s = pep2protkeys.get(pep); if (s.size() == 1) unique_cnt++; } // NB: cost = 1 if there are no unique peptides if (unique_cnt > 0) { costs.put(prot2lp.get(accsn), new Double(1.0/(unique_cnt+1))); } } } // create a cplex-style file with the necessary ILP formulation File tmp_file = File.createTempFile("minprotset", ".lp"); PrintWriter pw = new PrintWriter(new FileWriter(tmp_file)); pw.println("minimize"); pw.print(" cost: "); Collection<String> c = prot2lp.values(); Iterator<String> it2 = c.iterator(); for (int i=0; i<c.size(); i++) { String prot_key = it2.next(); Double cost = costs.get(prot_key); if (cost != null) { pw.print(cost); pw.print(" "); } pw.print(prot_key); if (i<c.size()-1) { pw.print(" + "); } } pw.println(); pw.println("subject to"); for (String peptide : pep2lp.keySet()) { pw.print(pep2lp.get(peptide)+": "); Set<String> prots = pep2protkeys.get(peptide); if (prots == null || prots.size() < 1) throw new Exception("No proteins for peptide "+peptide); Iterator<String> it3 = prots.iterator(); for (int i=0; i<prots.size(); i++) { pw.print(it3.next()); pw.print(" "); if (i<prots.size()-1) pw.print("+ "); } pw.println(">= 1"); } pw.println("binary"); for (String prot_id : prot2lp.values()) { pw.println(" "+prot_id); } pw.println("end"); pw.close(); Logger l = Logger.getAnonymousLogger(); l.info("Created LP solver file: "+tmp_file.getAbsolutePath()); // run cplex to compute a solution... (hopefully!) List<String> args = new ArrayList<String>(); args.add(m_solver.getStringValue()); args.add("--min"); args.add("--lp"); args.add(tmp_file.getAbsolutePath()); args.add("-o"); File solution_file = File.createTempFile("minprotsol", ".txt"); args.add(solution_file.getAbsolutePath()); ProcessBuilder pb = new ProcessBuilder(args); pb.directory(tmp_file.getParentFile()); Process p = pb.start(); BufferedReader error_br = new BufferedReader(new InputStreamReader(p.getErrorStream())); BufferedReader out_br = new BufferedReader(new InputStreamReader(p.getInputStream())); out_br = new BufferedReader(new InputStreamReader(p.getInputStream())); StringBuffer stdout = new StringBuffer(10 * 1024); String line; while ((line = out_br.readLine()) != null) { stdout.append(line); stdout.append('\n'); } StringBuffer stderr = new StringBuffer(10 * 1024); while ((line = error_br.readLine()) != null) { stderr.append(line); stderr.append('\n'); } int exitCode = p.waitFor(); l.info("glpsol finished with code "+exitCode); // process the results from the ILP solver (GLPK specific and probably version specific) BufferedReader soln_rdr = new BufferedReader(new FileReader(solution_file)); Pattern pep_line_pattern = Pattern.compile("^\\s*\\d+\\s*(_p\\d+_)\\s+(\\d+)\\s+\\d+"); Pattern prot_line_pattern= Pattern.compile("^\\s*\\d+\\s*(_x\\d+_)\\s+\\S+\\s+(\\d+)\\s+"); HashMap<String,Integer> results_pep2cnt = new HashMap<String,Integer>(); HashMap<String,Integer> results_prot2cnt= new HashMap<String,Integer>(); while ((line = soln_rdr.readLine()) != null) { Matcher pep_matcher = pep_line_pattern.matcher(line); Matcher prot_matcher= prot_line_pattern.matcher(line); if (pep_matcher.find()) { String pep_key = pep_matcher.group(1); String pep_usage_cnt = pep_matcher.group(2); results_pep2cnt.put(pep_key, new Integer(pep_usage_cnt)); } else if (prot_matcher.find()) { String prot_key = prot_matcher.group(1); String prot_usage_cnt = prot_matcher.group(2); results_prot2cnt.put(prot_key, new Integer(prot_usage_cnt)); } } soln_rdr.close(); l.info("Got results from solver for "+results_pep2cnt.size()+" peptides, "+results_prot2cnt.size()+" proteins."); // 3. output TRUE for those rows which are part of the minimum set, FALSE otherwise it = inData[0].iterator(); while (it.hasNext()) { DataRow r = it.next(); String accsn = ((StringCell)r.getCell(accsn_idx)).getStringValue(); boolean is_min = (results_prot2cnt.get(prot2lp.get(accsn)).intValue() == 1); AppendedColumnRow new_r = new AppendedColumnRow(r, BooleanCell.get(is_min)); container.addRowToTable(new_r); } tmp_file.delete(); solution_file.delete(); container.close(); return new BufferedDataTable[] { container.getTable() }; } /** * {@inheritDoc} */ @Override protected void reset() { // NO-OP } /** * {@inheritDoc} */ @Override protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException { return new DataTableSpec[]{new DataTableSpec(inSpecs[0], make_output_spec())}; } private DataTableSpec make_output_spec() { DataColumnSpec cols[] = new DataColumnSpec[1]; cols[0] = new DataColumnSpecCreator("Is in minimum set?", BooleanCell.TYPE).createSpec(); return new DataTableSpec(cols); } /** * {@inheritDoc} */ @Override protected void saveSettingsTo(final NodeSettingsWO settings) { m_peptide_column.saveSettingsTo(settings); m_accsn_column.saveSettingsTo(settings); m_algorithm.saveSettingsTo(settings); m_solver.saveSettingsTo(settings); } /** * {@inheritDoc} */ @Override protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException { m_peptide_column.loadSettingsFrom(settings); m_accsn_column.loadSettingsFrom(settings); m_algorithm.loadSettingsFrom(settings); m_solver.loadSettingsFrom(settings); } /** * {@inheritDoc} */ @Override protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException { m_peptide_column.validateSettings(settings); m_accsn_column.validateSettings(settings); m_algorithm.validateSettings(settings); m_solver.validateSettings(settings); } /** * {@inheritDoc} */ @Override protected void loadInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { // TODO: generated method stub } /** * {@inheritDoc} */ @Override protected void saveInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { // TODO: generated method stub } }